{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Wrangle Wiki Toxic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "split = \"test\"\n",
    "df = pd.read_parquet(\n",
    "    f\"https://huggingface.co/api/datasets/OxAISH-AL-LLM/wiki_toxic/parquet/default/{split}/0.parquet\"\n",
    ")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"toxic\"] = df[\"label\"].map(bool)\n",
    "df = df.drop(columns=[\"label\"])\n",
    "df = df.rename(columns={\"comment_text\": \"text\"})\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_json(f\"wiki_toxic-{split}.jsonl\", orient=\"records\", lines=True)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
